import time
from requests.exceptions import ConnectTimeout #自定义异常
from requests.exceptions import RequestException

import requests
import json
import  os
from lxml import etree
from requests.exceptions import ConnectionError
from requests.exceptions import ChunkedEncodingError

# Site root; hrefs and image paths scraped from the HTML are appended to it.
BASE_URL = "https://collection.rubinmuseum.org"
# Image-view listing of the "china" search; the page number is sent as the
# ``page`` query parameter.
SEARCH_URL = BASE_URL + "/search/china/objects/images"

REQUEST_TIMEOUT = 30  # seconds before a stalled request is abandoned and retried
RETRY_DELAY = 3       # seconds to wait between retries
IMAGE_DIR = "img"
JSON_DIR = "json"

# Browser-like headers sent with every request.
# NOTE(review): the Cookie carries a fixed JSESSIONID that has presumably
# expired - confirm whether the site needs it at all.
# NOTE(review): 'br' is advertised in Accept-Encoding; requests can only decode
# Brotli bodies when a brotli package is installed - confirm in the target env.
REQUEST_HEADERS = {
    'Accept': 'text/html, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,no;q=0.5,ru;q=0.4',
    'Connection': 'keep-alive',
    'Cookie': 'exp_last_visit=1649033878; exp_last_activity=1649073395; JSESSIONID=5CEBD071B7DF2DC59A00BC58F58EF5DF',
    'DNT': '1',
    'Host': 'collection.rubinmuseum.org',
    'Referer': 'https://collection.rubinmuseum.org/search/china',
    'sec-ch-ua': '\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"100\", \"Microsoft Edge\";v=\"100\"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '\"Windows\"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29',
    'X-Requested-With': 'XMLHttpRequest',
}


def _fetch(target, params=None):
    """Download ``target`` and return the response, retrying until it succeeds.

    Any ``requests`` failure (connection error, timeout, truncated chunked
    body, ...) is reported by class name and retried after ``RETRY_DELAY``
    seconds, so a flaky connection does not abort a long scrape. Errors that
    are not ``RequestException`` propagate, because retrying cannot fix them.
    HTTP status codes are not inspected.

    Args:
        target: Absolute URL to request.
        params: Optional mapping of query-string parameters.

    Returns:
        The ``requests.Response`` of the first attempt that completed.
    """
    while True:
        try:
            # The timeout turns a silently stalled connection into an
            # exception, so it is retried instead of blocking forever.
            return requests.get(
                url=target,
                params=params,
                headers=REQUEST_HEADERS,
                timeout=REQUEST_TIMEOUT,
            )
        except RequestException as exc:
            print(type(exc).__name__)
            time.sleep(RETRY_DELAY)


def _build_record(detail_tree, record_id):
    """Collect the metadata of one object detail page into a dict.

    Args:
        detail_tree: lxml tree parsed from the detail page HTML.
        record_id: Sequential id assigned to this object.

    Returns:
        A dict with ``id`` and ``name`` followed by every label/value pair
        found in the detail view. ``name`` is an empty string when the page
        has no title text.
    """
    title = detail_tree.xpath(
        "//*[@id='detailView']/div/div[2]/div/div/div[1]/h1//text()")
    record = {'id': record_id, 'name': str(title[0]) if title else ''}

    # The sixth detail row holds a label plus a <ul> of values. The values
    # are stored as the string form of the list, as earlier output files do.
    list_label = detail_tree.xpath(
        "//*[@id='detailView']/div/div[2]/div/div/div[6]/div/span//text()")
    list_values = detail_tree.xpath(
        "//*[@id='detailView']/div/div[2]/div/div/div[6]/ul//text()")
    if list_label:
        record[str(list_label[0])] = str(list_values)

    # Every other row is a pair of spans: first the label, then the value.
    for row in detail_tree.xpath("//*[@id='detailView']/div/div[2]/div/div/div"):
        label = row.xpath("./span[1]//text()")
        value = row.xpath("./span[2]//text()")
        if label and value:
            record[str(label[0])] = str(value[0])
    return record


def _scrape(pages, start_id):
    """Save the image and metadata of every object on the given listing pages.

    For each object tile the detail page is fetched, the main image is written
    to ``img/<id>.jpg`` and the metadata to ``json/<id>.json``. Tiles without
    a detail link, and detail pages without an image, are reported and skipped
    without consuming an id.

    Args:
        pages: Iterable of listing page numbers to walk.
        start_id: Id given to the first saved object; later objects count up.

    Returns:
        The id the next object would receive.
    """
    # Create both output directories once, up front; a missing ``json``
    # directory would otherwise fail the first metadata write.
    os.makedirs(IMAGE_DIR, exist_ok=True)
    os.makedirs(JSON_DIR, exist_ok=True)

    record_id = start_id
    for page in pages:
        query = {'page': str(page)}
        print(query)
        # The page number travels only in ``params`` so it is not duplicated
        # in the query string.
        listing = etree.HTML(_fetch(SEARCH_URL, params=query).text)

        for tile in listing.xpath("//div[@id='timagesview']/div"):
            # Path relative to the tile itself, so no positional counter
            # is needed to find the tile's own link.
            links = tile.xpath("./div/div[2]/div[1]/a/@href")
            if not links:
                print("skipped: tile without detail link on page " + str(page))
                continue
            detail_url = BASE_URL + str(links[0])
            detail_tree = etree.HTML(_fetch(detail_url).text)

            image_paths = detail_tree.xpath("//*[@id='mediaZone']/div/img/@src")
            if not image_paths:
                print("skipped: no image on " + detail_url)
                continue
            image_url = BASE_URL + "/" + str(image_paths[0])
            image_bytes = _fetch(image_url).content

            image_file = IMAGE_DIR + '/' + str(record_id) + '.jpg'
            with open(image_file, 'wb') as fp:
                fp.write(image_bytes)
            print(str(record_id) + '!!!!')

            record = _build_record(detail_tree, record_id)
            print(record['name'])
            print(record)

            json_file_path = JSON_DIR + "/" + str(record_id) + ".json"
            # json.dumps escapes non-ASCII by default, so the file is ASCII.
            with open(json_file_path, 'w', encoding='utf-8') as json_file:
                json_file.write(json.dumps(record))

            record_id += 1
    return record_id


if __name__ == "__main__":
    # Hard-coded resume point: listing pages 13-14, ids continuing from 124.
    _scrape(pages=range(13, 15), start_id=124)
    print("#######################################")


